# Force the C locale so number/date parsing and sorting behave consistently.
# NOTE(review): this overrides the user's locale for the whole session.
Sys.setlocale('LC_ALL', 'C')
[1] "LC_CTYPE=C;LC_NUMERIC=C;LC_TIME=C;LC_COLLATE=C;LC_MONETARY=C;LC_MESSAGES=en_US.UTF-8;LC_PAPER=vi_VN;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=vi_VN;LC_IDENTIFICATION=C"
# Packages: fpp2 (time-series data + forecasting tools), gam (additive models),
# prophet (trend/changepoint forecasting), gbm (boosting), viridis (palettes).
library(fpp2)
library(gam)
library(prophet)
library(gbm)      # no space before "(" — tidyverse style
library(viridis)
Loading required package: viridisLite
setwd('/home/hieu/BDMA/TSA')  # NOTE(review): hard-coded path; prefer relative paths or here::here()
# Quarterly international arrivals to Australia (fpp2::arrivals, 4 columns).
?arrivals
autoplot(arrivals)

# Plot only the first two series of the multivariate ts.
autoplot(arrivals[,c(1,2)])


# Split into one univariate series per origin country (column order per ?arrivals).
Japan<- arrivals[,1]
NZ<- arrivals[,2]
UK<- arrivals[,3]
US<- arrivals[,4]

Linear regression

# Linear regression: linear trend + quarterly seasonal dummies for NZ arrivals.
tt <- seq_along(NZ)  # time index 1..n (seq_along instead of 1:length(x))
# Quarter labels: full years of Q1-Q4 plus the 3 leftover quarters
# (the series length is not a multiple of 4 — see the original note below).
#####1:3 because there are three observations 'out'
seas <- factor(c(rep(1:4, length(NZ) / 4), 1:3))
mod2 <- lm(NZ ~ tt + seas)
summary(mod2)
AIC(mod2)

Linear model with no seasonality

# Trend-only linear model (no seasonal dummies), for comparison with mod2.
mod2a <- lm(NZ ~ tt)
summary(mod2a)

we now try with a GAM

# GAM: smoothing-spline trend s(tt) plus quarterly dummies.
# Values for df should be greater than 1, with df=1 implying a linear fit. Default is df=4
g1 <- gam(NZ~s(tt)+seas)
summary(g1)

Call: gam(formula = NZ ~ s(tt) + seas)
Deviance Residuals:
    Min      1Q  Median      3Q     Max 
-38.576 -11.971  -1.408  11.954  50.091 

(Dispersion Parameter for gaussian family taken to be 307.5263)

    Null Deviance: 903510.7 on 126 degrees of freedom
Residual Deviance: 36595.57 on 118.9998 degrees of freedom
AIC: 1097.675 

Number of Local Scoring Iterations: NA 

Anova for Parametric Effects
           Df Sum Sq Mean Sq  F value    Pr(>F)    
s(tt)       1 783533  783533 2547.856 < 2.2e-16 ***
seas        3  70207   23402   76.098 < 2.2e-16 ***
Residuals 119  36596     308                       
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Anova for Nonparametric Effects
            Npar Df Npar F     Pr(F)    
(Intercept)                             
s(tt)             3 15.121 2.121e-08 ***
seas                                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Show the two estimated effects side by side with standard-error bands.
par(mfrow=c(1,2))
plot(g1, se=TRUE)  # TRUE, not T (T/F are reassignable)
AIC(g1)
[1] 1097.675
# time has a nonlinear effect
# Smooth trend only (no seasonal term), to gauge what seasonality contributes.
g1a <- gam(NZ~s(tt))
par(mfrow=c(1,2))

plot(g1a, se=TRUE)  # se=TRUE: standard-error bands (avoid T shorthand)
summary(g1a)

Call: gam(formula = NZ ~ s(tt))
Deviance Residuals:
    Min      1Q  Median      3Q     Max 
-74.573 -18.713   4.153  20.611  76.733 

(Dispersion Parameter for gaussian family taken to be 876.4429)

    Null Deviance: 903510.7 on 126 degrees of freedom
Residual Deviance: 106925.9 on 121.9998 degrees of freedom
AIC: 1227.845 

Number of Local Scoring Iterations: NA 

Anova for Parametric Effects
           Df Sum Sq Mean Sq F value    Pr(>F)    
s(tt)       1 783533  783533  893.99 < 2.2e-16 ***
Residuals 122 106926     876                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Anova for Nonparametric Effects
            Npar Df Npar F    Pr(F)   
(Intercept)                           
s(tt)             3 4.9637 0.002762 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# linear model may be also performed with library(gam)
# NOTE(review): tt enters linearly here, so this should reproduce the lm fit
# of mod2 (both terms are parametric) — confirm by comparing coefficients.
g0 <- gam(NZ~(tt)+seas)
summary(g0)

Call: gam(formula = NZ ~ (tt) + seas)
Deviance Residuals:
    Min      1Q  Median      3Q     Max 
-37.777 -15.263  -2.644  12.825  53.704 

(Dispersion Parameter for gaussian family taken to be 414.3167)

    Null Deviance: 903510.7 on 126 degrees of freedom
Residual Deviance: 50546.63 on 122 degrees of freedom
AIC: 1132.691 

Number of Local Scoring Iterations: 2 

Anova for Parametric Effects
           Df Sum Sq Mean Sq F value    Pr(>F)    
tt          1 783533  783533 1891.14 < 2.2e-16 ***
seas        3  69431   23144   55.86 < 2.2e-16 ***
Residuals 122  50547     414                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot the linear-trend and seasonal effects with SE bands.
par(mfrow=c(1,2))

plot(g0, se=TRUE)  # TRUE, not T

AIC(g0)
[1] 1132.691

GAM with splines performs better

# try another option with loess (lo): local-regression smoother for the trend
# instead of the smoothing spline used in g1
g2<- gam(NZ~lo(tt)+seas)
summary(g2)

Call: gam(formula = NZ ~ lo(tt) + seas)
Deviance Residuals:
     Min       1Q   Median       3Q      Max 
-39.3138 -12.3064  -0.8082  11.4180  50.4500 

(Dispersion Parameter for gaussian family taken to be 329.8333)

    Null Deviance: 903510.7 on 126 degrees of freedom
Residual Deviance: 39463.9 on 119.648 degrees of freedom
AIC: 1105.962 

Number of Local Scoring Iterations: NA 

Anova for Parametric Effects
              Df Sum Sq Mean Sq  F value    Pr(>F)    
lo(tt)      1.00 783533  783533 2375.541 < 2.2e-16 ***
seas        3.00  69943   23314   70.686 < 2.2e-16 ***
Residuals 119.65  39464     330                       
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Anova for Nonparametric Effects
            Npar Df Npar F     Pr(F)    
(Intercept)                             
lo(tt)          2.4 14.286 6.383e-07 ***
seas                                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot the loess trend and seasonal effects with SE bands.
par(mfrow=c(1,2))
plot(g2, se=TRUE)  # TRUE, not T

AIC(g2)
[1] 1105.962
# perform analysis of residuals
tsdisplay(residuals(g1))  # series + ACF/PACF: check leftover autocorrelation

# Two-stage approach: fit an ARIMA model to the GAM residuals.
aar1<- auto.arima(residuals(g1))

plot(as.numeric(NZ), type="l")
lines(fitted(g1), col=3)                # green: GAM fit alone
lines(fitted(aar1)+ fitted(g1), col=4)  # blue: GAM fit + ARIMA residual fit


# Exercise: try also with the other variables 

Prophet

iPhone sales

# data: quarterly iPhone unit sales, semicolon-separated CSV
iphone <- read.csv("iphone.csv", sep = ";")  # "<-" for assignment, not "="
str(iphone)
'data.frame':   46 obs. of  3 variables:
 $ iphone : num  0.27 1.12 2.32 1.7 0.72 6.89 4.36 3.79 5.21 7.37 ...
 $ quarter: chr  "01/10/2007" "31/12/2007" "31/03/2008" "01/07/2008" ...
 $ cap    : int  100 100 100 100 100 100 100 100 100 100 ...
# some preliminary analysis on the data
# Parse the quarter strings (dd/mm/yyyy) into Date objects.
iphone$quarter <- as.Date(iphone$quarter, format = "%d/%m/%Y")
str(iphone)
'data.frame':   46 obs. of  3 variables:
 $ iphone : num  0.27 1.12 2.32 1.7 0.72 6.89 4.36 3.79 5.21 7.37 ...
 $ quarter: Date, format: "2007-10-01" "2007-12-31" "2008-03-31" ...
 $ cap    : int  100 100 100 100 100 100 100 100 100 100 ...
# Rename to prophet's expected columns: y (response), ds (date stamp),
# cap (carrying capacity, required for logistic growth).
colnames(iphone) <- c("y","ds","cap")

summary(iphone$y) ##to explain why cap is 100
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.270   8.742  35.130  31.916  47.370  78.290 
#plot
plot(iphone$y, type="l", x=iphone$ds, ylab="", xlab="Year")

# nonlinear trend
# seasonality
# Prophet model
# model with no seasonality
# m2=prophet(iphone, yearly.seasonality=F, daily.seasonality=F, weekly.seasonality = F, growth="logistic", n.changepoints=15)


# model with automatic selection of seasonality (default is "auto")
# in this case just yearly seasonality; if daily and weekly seasonality
# are needed we need to specify them explicitly
# changepoints not specified, automatically set
m2 <- prophet(iphone, growth = "logistic")
Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# n.changepoints default number is 25
summary(m2)  # inspect the fitted prophet object (changepoints, seasonalities, ...)
                        Length Class      Mode     
growth                   1     -none-     character
changepoints            25     POSIXct    numeric  
n.changepoints           1     -none-     numeric  
changepoint.range        1     -none-     numeric  
yearly.seasonality       1     -none-     character
weekly.seasonality       1     -none-     character
daily.seasonality        1     -none-     character
holidays                 0     -none-     NULL     
seasonality.mode         1     -none-     character
seasonality.prior.scale  1     -none-     numeric  
changepoint.prior.scale  1     -none-     numeric  
holidays.prior.scale     1     -none-     numeric  
mcmc.samples             1     -none-     numeric  
interval.width           1     -none-     numeric  
uncertainty.samples      1     -none-     numeric  
specified.changepoints   1     -none-     logical  
start                    1     POSIXct    numeric  
y.scale                  1     -none-     numeric  
logistic.floor           1     -none-     logical  
t.scale                  1     -none-     numeric  
changepoints.t          25     -none-     numeric  
seasonalities            1     -none-     list     
extra_regressors         0     -none-     list     
country_holidays         0     -none-     NULL     
stan.fit                 4     -none-     list     
params                   6     -none-     list     
history                  7     data.frame list     
history.dates           46     POSIXct    numeric  
train.holiday.names      0     -none-     NULL     
train.component.cols     3     data.frame list     
component.modes          2     -none-     list     
fit.kwargs               0     -none-     list     
# create a future 'window' for prediction: 20 quarters (5 years) ahead,
# keeping the history so fitted values are plotted too
future2 <- make_future_dataframe(m2, periods = 20, freq = "quarter", include_history = TRUE)
tail(future2)
future2$cap <- 100  # logistic growth needs cap on the prediction frame as well

forecast2 <- predict(m2, future2)
tail(forecast2[c("ds", "yhat", "yhat_lower", "yhat_upper")])

# prediction plot
plot(m2, forecast2)


# Dynamic plot
dyplot.prophet(m2, forecast2)

# plot with change points
# threshold=0 draws every changepoint regardless of estimated rate change
plot(m2, forecast2)+add_changepoints_to_plot(m2, threshold=0)

# dates corresponding to change points
m2$changepoints
[1] 0
# Prophet with no change points and multiplicative seasonality
m2 <- prophet(iphone, growth="logistic", n.changepoints=0, seasonality.mode='multiplicative')
Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# summary(m2)
# m2$seasonalities
# Rebuild the future frame for the new model and forecast again.
future2 <- make_future_dataframe(m2, periods = 20, freq = "quarter", include_history = TRUE)
tail(future2)
future2$cap <- 100  # cap is required on the future frame for logistic growth

forecast2 <- predict(m2, future2)
tail(forecast2[c("ds", "yhat", "yhat_lower", "yhat_upper")])

# prediction plot
plot(m2, forecast2)

dyplot.prophet(m2, forecast2)

Generalized Additive Models

‘Movies’ Dataset

# Movies metadata; read character columns as factors for modelling.
data <- read.csv("movies.csv", stringsAsFactors=TRUE)
str(data)
'data.frame':   3229 obs. of  27 variables:
 $ X.1                 : int  1 2 3 4 5 6 7 8 9 10 ...
 $ X                   : int  1 2 3 4 5 6 7 8 9 10 ...
 $ budget              : int  237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
 $ popularity          : num  1.50e+08 1.39e+08 1.07e+08 1.12e+07 4.39e+07 ...
 $ production_countries: int  1 1 1 1 1 1 1 1 1 1 ...
 $ release_date        : Factor w/ 2494 levels "01/01/1961","01/01/1969",..: 785 1552 2190 1304 470 40 2005 1792 499 1874 ...
 $ revenue             : num  2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
 $ runtime             : int  162 169 148 165 132 139 100 141 153 151 ...
 $ spoken_languages    : int  1 1 1 1 1 1 1 1 1 1 ...
 $ vote_average        : num  7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
 $ vote_classes        : Factor w/ 5 levels "0 a 5","5 a 6",..: 4 3 3 4 3 2 4 4 4 2 ...
 $ Comp_Universal      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Paramount      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Warner         : int  0 0 0 1 0 0 0 0 1 1 ...
 $ Comp_20fox          : int  1 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Columbia       : int  0 0 1 0 0 1 0 0 0 0 ...
 $ Comp_NewLine        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Disney         : int  0 1 0 0 1 0 1 0 0 0 ...
 $ Comp_VillageRoadshow: int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Miramax        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Pixar          : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_RelMedia       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_MGM            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_DreamWorks     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Touchstone     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_CanalPlus      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ genere              : Factor w/ 19 levels "Action","Adventure",..: 9 2 1 1 1 1 3 1 9 1 ...
# drop the first two columns (X.1, X): duplicated row-index variables
data<-data[,-c(1,2)]

# transform variable release_date from factor to Date ("dd/mm/yyyy")
data$release_date <- as.Date(data$release_date, "%d/%m/%Y")

str(data)
'data.frame':   3229 obs. of  25 variables:
 $ budget              : int  237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
 $ popularity          : num  1.50e+08 1.39e+08 1.07e+08 1.12e+07 4.39e+07 ...
 $ production_countries: int  1 1 1 1 1 1 1 1 1 1 ...
 $ release_date        : Date, format: "2009-12-10" "2007-05-19" "2015-10-26" ...
 $ revenue             : num  2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
 $ runtime             : int  162 169 148 165 132 139 100 141 153 151 ...
 $ spoken_languages    : int  1 1 1 1 1 1 1 1 1 1 ...
 $ vote_average        : num  7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
 $ vote_classes        : Factor w/ 5 levels "0 a 5","5 a 6",..: 4 3 3 4 3 2 4 4 4 2 ...
 $ Comp_Universal      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Paramount      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Warner         : int  0 0 0 1 0 0 0 0 1 1 ...
 $ Comp_20fox          : int  1 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Columbia       : int  0 0 1 0 0 1 0 0 0 0 ...
 $ Comp_NewLine        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Disney         : int  0 1 0 0 1 0 1 0 0 0 ...
 $ Comp_VillageRoadshow: int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Miramax        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Pixar          : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_RelMedia       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_MGM            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_DreamWorks     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_Touchstone     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Comp_CanalPlus      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ genere              : Factor w/ 19 levels "Action","Adventure",..: 9 2 1 1 1 1 3 1 9 1 ...
# Response variable: average user rating
summary(data$vote_average)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   5.800   6.300   6.309   6.900   8.500 
# Distribution of the response (boxplot alternative kept for reference).
#boxplot(data$vote_average, col="orange", ylim=c(0,10), main="Movies", ylab="Rating")
hist(data$vote_average, col="orange", main="Movies", xlab="Rating")

# explanatory variables
summary(data)
     budget            popularity        production_countries  release_date           revenue         
 Min.   :        1   Min.   :        0   Min.   :0.0000       Min.   :1916-09-04   Min.   :5.000e+00  
 1st Qu.: 10500000   1st Qu.:  7447714   1st Qu.:1.0000       1st Qu.:1998-09-11   1st Qu.:1.700e+07  
 Median : 25000000   Median : 18256028   Median :1.0000       Median :2005-07-20   Median :5.518e+07  
 Mean   : 40654445   Mean   : 26572600   Mean   :0.9006       Mean   :2002-03-21   Mean   :1.212e+08  
 3rd Qu.: 55000000   3rd Qu.: 35307577   3rd Qu.:1.0000       3rd Qu.:2010-11-12   3rd Qu.:1.463e+08  
 Max.   :380000000   Max.   :875581305   Max.   :1.0000       Max.   :2016-09-09   Max.   :2.788e+09  
                                                                                                      
    runtime      spoken_languages  vote_average   vote_classes  Comp_Universal    Comp_Paramount   
 Min.   : 41.0   Min.   :0.0000   Min.   :0.000   0 a 5 : 192   Min.   :0.00000   Min.   :0.00000  
 1st Qu.: 96.0   1st Qu.:1.0000   1st Qu.:5.800   5 a 6 : 843   1st Qu.:0.00000   1st Qu.:0.00000  
 Median :107.0   Median :1.0000   Median :6.300   6 a 7 :1426   Median :0.00000   Median :0.00000  
 Mean   :110.7   Mean   :0.9709   Mean   :6.309   7 a 8 : 706   Mean   :0.08826   Mean   :0.08083  
 3rd Qu.:121.0   3rd Qu.:1.0000   3rd Qu.:6.900   8 a 10:  62   3rd Qu.:0.00000   3rd Qu.:0.00000  
 Max.   :338.0   Max.   :1.0000   Max.   :8.500                 Max.   :1.00000   Max.   :1.00000  
                                                                                                   
  Comp_Warner        Comp_20fox      Comp_Columbia      Comp_NewLine      Comp_Disney     
 Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
 Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
 Mean   :0.09291   Mean   :0.06473   Mean   :0.08176   Mean   :0.04429   Mean   :0.03685  
 3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
 Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
                                                                                          
 Comp_VillageRoadshow  Comp_Miramax       Comp_Pixar       Comp_RelMedia        Comp_MGM     
 Min.   :0.00000      Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   :0.0000  
 1st Qu.:0.00000      1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.0000  
 Median :0.00000      Median :0.00000   Median :0.000000   Median :0.00000   Median :0.0000  
 Mean   :0.02261      Mean   :0.02323   Mean   :0.004955   Mean   :0.03097   Mean   :0.0288  
 3rd Qu.:0.00000      3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.0000  
 Max.   :1.00000      Max.   :1.00000   Max.   :1.000000   Max.   :1.00000   Max.   :1.0000  
                                                                                             
 Comp_DreamWorks   Comp_Touchstone  Comp_CanalPlus          genere   
 Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Action   :557  
 1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000   Drama    :556  
 Median :0.00000   Median :0.0000   Median :0.00000   Comedy   :487  
 Mean   :0.03066   Mean   :0.0288   Mean   :0.03345   Thriller :253  
 3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000   Horror   :195  
 Max.   :1.00000   Max.   :1.0000   Max.   :1.00000   Animation:187  
                                                      (Other)  :994  
# Focus on the four quantitative predictors: budget, popularity, revenue, runtime.
summary(data[,c(1,2,5,6)])
     budget            popularity           revenue             runtime     
 Min.   :        1   Min.   :        0   Min.   :5.000e+00   Min.   : 41.0  
 1st Qu.: 10500000   1st Qu.:  7447714   1st Qu.:1.700e+07   1st Qu.: 96.0  
 Median : 25000000   Median : 18256028   Median :5.518e+07   Median :107.0  
 Mean   : 40654445   Mean   : 26572600   Mean   :1.212e+08   Mean   :110.7  
 3rd Qu.: 55000000   3rd Qu.: 35307577   3rd Qu.:1.463e+08   3rd Qu.:121.0  
 Max.   :380000000   Max.   :875581305   Max.   :2.788e+09   Max.   :338.0  
# Histograms of the four quantitative variables on the original scale
# (their skew motivates the log transform that follows).
par(mfrow = c(2, 2))
for (i in c(1, 2, 5, 6)) {
  # paste() on a single string was a no-op; use the column name directly
  hist(data[, i], col = "orange", main = colnames(data)[i], xlab = "")
}


#transform quantitative variables in log scale
# (compresses the heavy right tails seen in the histograms above)
data$budget <- log(data$budget)
data$popularity <- log(data$popularity)
data$revenue <- log(data$revenue)

summary(data[,c(1,2,5,6)])
     budget        popularity        revenue          runtime     
 Min.   : 0.00   Min.   :-3.913   Min.   : 1.609   Min.   : 41.0  
 1st Qu.:16.17   1st Qu.:15.823   1st Qu.:16.649   1st Qu.: 96.0  
 Median :17.03   Median :16.720   Median :17.826   Median :107.0  
 Mean   :16.80   Mean   :16.213   Mean   :17.491   Mean   :110.7  
 3rd Qu.:17.82   3rd Qu.:17.380   3rd Qu.:18.801   3rd Qu.:121.0  
 Max.   :19.76   Max.   :20.590   Max.   :21.749   Max.   :338.0  
# Re-draw the histograms after the log transform.
par(mfrow = c(2, 2))
for (i in c(1, 2, 5, 6)) {
  # paste() on a single string was a no-op; use the column name directly
  hist(data[, i], col = "orange", main = colnames(data)[i], xlab = "")
}
# go back to original single-panel layout
par(mfrow = c(1, 1))


#transform release_date in numeric
# (days since 1970-01-01, the Date epoch; negative values = pre-1970 releases)
data$release_date<-as.numeric(data$release_date)
summary(data$release_date)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -19477   10480   12984   11767   14925   17053 
# sanity check: as.numeric(as.Date("1970-01-01")) is 0 — the Date epoch


# Set train and test: 70% of rows for training (seed fixed for reproducibility)
set.seed(1)
train <- sample(seq_len(nrow(data)), 0.7 * nrow(data))  # seq_len, not 1:nrow
data.train <- data[train, ]
data.test <- data[-train, ]

# make some variables factor (binary company indicators and 0/1 counts)
data.train[, c(3, 7, 10:24)] <- lapply(data.train[, c(3, 7, 10:24)], factor)
data.test[, c(3, 7, 10:24)] <- lapply(data.test[, c(3, 7, 10:24)], factor)

str(data.train)
'data.frame':   2260 obs. of  25 variables:
 $ budget              : num  17.6 17.9 16.5 17.7 17.1 ...
 $ popularity          : num  15.1 16.8 16.7 15 16.6 ...
 $ production_countries: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 1 2 ...
 $ release_date        : num  9486 13138 16912 10172 13719 ...
 $ revenue             : num  16.4 18.7 16.8 16.2 18.3 ...
 $ runtime             : int  192 94 94 114 104 106 89 104 93 105 ...
 $ spoken_languages    : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
 $ vote_average        : num  7.1 5.7 6 5.9 6.1 5.7 5.5 3.7 4.3 5.4 ...
 $ vote_classes        : Factor w/ 5 levels "0 a 5","5 a 6",..: 4 2 3 2 3 2 2 1 1 2 ...
 $ Comp_Universal      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ Comp_Paramount      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ Comp_Warner         : Factor w/ 2 levels "0","1": 1 1 1 2 2 1 1 1 1 1 ...
 $ Comp_20fox          : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
 $ Comp_Columbia       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ Comp_NewLine        : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 2 1 2 ...
 $ Comp_Disney         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ Comp_VillageRoadshow: Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
 $ Comp_Miramax        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ Comp_Pixar          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ Comp_RelMedia       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ Comp_MGM            : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ Comp_DreamWorks     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ Comp_Touchstone     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ Comp_CanalPlus      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ genere              : Factor w/ 19 levels "Action","Adventure",..: 10 4 1 17 4 9 11 4 4 4 ...

Linear Model

# Full linear model: all predictors except vote_classes
# (vote_classes is a binned version of the response itself — see its levels).
m1 <- lm(vote_average~.-vote_classes, data=data.train)

summary(m1)

# Stepwise Regression (both directions, AIC criterion)
m2 <- step(m1, direction='both')
summary(m2)

#Prediction: sum of squared errors on the held-out test set
p.lm <- predict(m2, newdata=data.test)
dev.lm <- sum((p.lm-data.test$vote_average)^2)
dev.lm

AIC(m2)

GAM

# Stepwise GAM
# Start with a linear model (df=1)
g3 <- gam(vote_average~.-vote_classes, data=data.train)

# Show the linear effects 
par(mfrow=c(3,5))
plot(g3, se=T) 


# Perform stepwise selection using gam scope
# Values for df should be greater than 1, with df=1 implying a linear fit

sc = gam.scope(data.train[,-8], response=8, arg=c("df=2","df=3","df=4"))
g4<- step.Gam(g3, scope=sc, trace=T)
Start:  vote_average ~ . - vote_classes; AIC= 4768.761 
Step:1 vote_average ~ budget + s(popularity, df = 2) + production_countries +      release_date + revenue + runtime + spoken_languages + Comp_Universal +      Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia +      Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax +      Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks +      Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4669.558 
Step:2 vote_average ~ budget + s(popularity, df = 3) + production_countries +      release_date + revenue + runtime + spoken_languages + Comp_Universal +      Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia +      Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax +      Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks +      Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4613.909 
Step:3 vote_average ~ s(budget, df = 2) + s(popularity, df = 3) + production_countries +      release_date + revenue + runtime + spoken_languages + Comp_Universal +      Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia +      Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax +      Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks +      Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4568.704 
Step:4 vote_average ~ s(budget, df = 2) + s(popularity, df = 3) + production_countries +      release_date + revenue + s(runtime, df = 2) + spoken_languages +      Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox +      Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow +      Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks +      Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4524.893 
Step:5 vote_average ~ s(budget, df = 2) + s(popularity, df = 4) + production_countries +      release_date + revenue + s(runtime, df = 2) + spoken_languages +      Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox +      Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow +      Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks +      Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4496.516 
Step:6 vote_average ~ s(budget, df = 3) + s(popularity, df = 4) + production_countries +      release_date + revenue + s(runtime, df = 2) + spoken_languages +      Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox +      Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow +      Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks +      Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4484.31 
Step:7 vote_average ~ s(budget, df = 3) + s(popularity, df = 4) + production_countries +      s(release_date, df = 2) + revenue + s(runtime, df = 2) +      spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner +      Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney +      Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia +      Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus +      genere ; AIC= 4472.868 
Step:8 vote_average ~ s(budget, df = 3) + s(popularity, df = 4) + production_countries +      s(release_date, df = 2) + revenue + s(runtime, df = 3) +      spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner +      Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney +      Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia +      Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus +      genere ; AIC= 4463.158 
Step:9 vote_average ~ s(budget, df = 3) + s(popularity, df = 4) + production_countries +      s(release_date, df = 2) + s(revenue, df = 2) + s(runtime,      df = 3) + spoken_languages + Comp_Universal + Comp_Paramount +      Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine +      Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar +      Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone +      Comp_CanalPlus + genere ; AIC= 4460.939 
Step:10 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 2) + s(revenue, df = 2) + s(runtime,      df = 3) + spoken_languages + Comp_Universal + Comp_Paramount +      Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine +      Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar +      Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone +      Comp_CanalPlus + genere ; AIC= 4458.619 
Step:11 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 2) + s(revenue, df = 3) + s(runtime,      df = 3) + spoken_languages + Comp_Universal + Comp_Paramount +      Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine +      Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar +      Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone +      Comp_CanalPlus + genere ; AIC= 4456.526 
Step:12 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 2) + s(revenue, df = 3) + s(runtime,      df = 3) + spoken_languages + Comp_Universal + Comp_Paramount +      Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine +      Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar +      Comp_RelMedia + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus +      genere ; AIC= 4454.492 
Step:13 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 2) + s(revenue, df = 3) + s(runtime,      df = 3) + spoken_languages + Comp_Paramount + Comp_Warner +      Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney +      Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia +      Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4452.51 
Step:14 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 2) + s(revenue, df = 3) + s(runtime,      df = 3) + spoken_languages + Comp_Paramount + Comp_Warner +      Comp_20fox + Comp_Columbia + Comp_Disney + Comp_VillageRoadshow +      Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_DreamWorks +      Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4450.537 
Step:15 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 2) + s(revenue, df = 3) + s(runtime,      df = 3) + spoken_languages + Comp_Paramount + Comp_Warner +      Comp_20fox + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax +      Comp_Pixar + Comp_RelMedia + Comp_DreamWorks + Comp_Touchstone +      Comp_CanalPlus + genere ; AIC= 4448.61 
Step:16 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 2) + s(revenue, df = 3) + s(runtime,      df = 3) + spoken_languages + Comp_Paramount + Comp_Warner +      Comp_20fox + Comp_Disney + Comp_Miramax + Comp_Pixar + Comp_RelMedia +      Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4446.756 
Step:17 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 2) + s(revenue, df = 3) + s(runtime,      df = 3) + spoken_languages + Comp_Paramount + Comp_Warner +      Comp_20fox + Comp_Disney + Comp_Miramax + Comp_Pixar + Comp_DreamWorks +      Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4444.949 
Step:18 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 3) + s(revenue, df = 3) + s(runtime,      df = 3) + spoken_languages + Comp_Paramount + Comp_Warner +      Comp_20fox + Comp_Disney + Comp_Miramax + Comp_Pixar + Comp_DreamWorks +      Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4443.306 
Step:19 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 3) + s(revenue, df = 3) + s(runtime,      df = 3) + spoken_languages + Comp_Paramount + Comp_Warner +      Comp_Disney + Comp_Miramax + Comp_Pixar + Comp_DreamWorks +      Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4441.657 
Step:20 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 3) + s(revenue, df = 3) + s(runtime,      df = 3) + spoken_languages + Comp_Warner + Comp_Disney +      Comp_Miramax + Comp_Pixar + Comp_DreamWorks + Comp_Touchstone +      Comp_CanalPlus + genere ; AIC= 4440.046 
Step:21 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 3) + s(revenue, df = 3) + s(runtime,      df = 3) + spoken_languages + Comp_Warner + Comp_Disney +      Comp_Miramax + Comp_Pixar + Comp_DreamWorks + Comp_CanalPlus +      genere ; AIC= 4438.899 
Step:22 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 3) + s(revenue, df = 3) + s(runtime,      df = 4) + spoken_languages + Comp_Warner + Comp_Disney +      Comp_Miramax + Comp_Pixar + Comp_DreamWorks + Comp_CanalPlus +      genere ; AIC= 4437.876 
Step:23 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 3) + s(revenue, df = 3) + s(runtime,      df = 4) + spoken_languages + Comp_Warner + Comp_Disney +      Comp_Pixar + Comp_DreamWorks + Comp_CanalPlus + genere ; AIC= 4436.856 
Step:24 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 4) + s(revenue, df = 3) + s(runtime,      df = 4) + spoken_languages + Comp_Warner + Comp_Disney +      Comp_Pixar + Comp_DreamWorks + Comp_CanalPlus + genere ; AIC= 4436.163 
Step:25 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 4) + s(revenue, df = 4) + s(runtime,      df = 4) + spoken_languages + Comp_Warner + Comp_Disney +      Comp_Pixar + Comp_DreamWorks + Comp_CanalPlus + genere ; AIC= 4435.783 
Step:26 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 4) + s(revenue, df = 4) + s(runtime,      df = 4) + spoken_languages + Comp_Warner + Comp_Disney +      Comp_Pixar + Comp_DreamWorks + genere ; AIC= 4435.674 
Step:27 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries +      s(release_date, df = 4) + s(revenue, df = 4) + s(runtime,      df = 4) + spoken_languages + Comp_Warner + Comp_Disney +      Comp_DreamWorks + genere ; AIC= 4435.659 
summary(g4)

Call: gam(formula = vote_average ~ s(budget, df = 4) + s(popularity, 
    df = 4) + production_countries + s(release_date, df = 4) + 
    s(revenue, df = 4) + s(runtime, df = 4) + spoken_languages + 
    Comp_Warner + Comp_Disney + Comp_DreamWorks + genere, data = data.train, 
    trace = FALSE)
Deviance Residuals:
     Min       1Q   Median       3Q      Max 
-5.23538 -0.35948  0.02259  0.40028  3.51055 

(Dispersion Parameter for gaussian family taken to be 0.4085)

    Null Deviance: 1696.602 on 2259 degrees of freedom
Residual Deviance: 905.1506 on 2216 degrees of freedom
AIC: 4435.659 

Number of Local Scoring Iterations: NA 

Anova for Parametric Effects
                          Df Sum Sq Mean Sq  F value    Pr(>F)    
s(budget, df = 4)          1  61.03  61.028 149.4083 < 2.2e-16 ***
s(popularity, df = 4)      1 147.97 147.968 362.2558 < 2.2e-16 ***
production_countries       1  16.86  16.861  41.2797 1.610e-10 ***
s(release_date, df = 4)    1  64.05  64.054 156.8172 < 2.2e-16 ***
s(revenue, df = 4)         1  38.46  38.458  94.1523 < 2.2e-16 ***
s(runtime, df = 4)         1 189.85 189.849 464.7908 < 2.2e-16 ***
spoken_languages           1   1.47   1.466   3.5898  0.058267 .  
Comp_Warner                1   0.75   0.752   1.8402  0.175061    
Comp_Disney                1   4.31   4.314  10.5626  0.001171 ** 
Comp_DreamWorks            1  12.73  12.727  31.1583 2.669e-08 ***
genere                    18 115.34   6.408  15.6873 < 2.2e-16 ***
Residuals               2216 905.15   0.408                       
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Anova for Nonparametric Effects
                        Npar Df Npar F     Pr(F)    
(Intercept)                                         
s(budget, df = 4)             3 35.280 < 2.2e-16 ***
s(popularity, df = 4)         3 71.661 < 2.2e-16 ***
production_countries                                
s(release_date, df = 4)       3  6.294 0.0002999 ***
s(revenue, df = 4)            3  6.285 0.0003035 ***
s(runtime, df = 4)            3 21.216 1.503e-13 ***
spoken_languages                                    
Comp_Warner                                         
Comp_Disney                                         
Comp_DreamWorks                                     
genere                                              
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(g4)
[1] 4435.659
par(mfrow=c(3,5))
plot(g4, se=T)

# if we want to see better some plot
par(mfrow=c(1,1))

plot(g4, se=T, ask=T)
Make a plot selection (or 0 to exit):
 

 1: plot: s(budget, df = 4)         2: plot: s(popularity, df = 4)  
 3: plot: production_countries      4: plot: s(release_date, df = 4)
 5: plot: s(revenue, df = 4)        6: plot: s(runtime, df = 4)     
 7: plot: spoken_languages          8: plot: Comp_Warner            
 9: plot: Comp_Disney              10: plot: Comp_DreamWorks        
11: plot: genere                   12: plot all terms               
13: residuals on                   14: rug off                      
15: se off                         16: scale (0)                    

Gradient Boosting

Boosting-

boost.movies=gbm(vote_average ~ .-vote_classes, data=data.train, 
                 distribution="gaussian", n.trees=5000, interaction.depth=1)

# for the plot
par(mfrow=c(1,1))

# plot of training error
plot(boost.movies$train.error, type="l", ylab="training error")

# always decreasing with increasing number of trees
# relative influence plot
summary(boost.movies) 

# let us modify the graphical parameters to obtain a better plot: more space on the left
# default vector of parameters
mai.old<-par()$mai
mai.old
[1] 1.02 0.82 0.82 0.42
# new vector
mai.new<-mai.old
# new space on the left
mai.new[2] <- 2.5 
mai.new
[1] 1.02 2.50 0.82 0.42
# modify graphical parameters
par(mai=mai.new)
summary(boost.movies, las=1) 

# las=1 horizontal names on y
summary(boost.movies, las=1, cBar=10) 
# cBar defines how many variables
# back to original window
par(mai=mai.old)

# test set prediction for every iteration (1:5000)
yhat.boost=predict(boost.movies, newdata=data.test, n.trees=1:5000)

# calculate the error for each iteration
# use 'apply' to perform a 'cycle for' 
# the first element is the matrix we want to use, 2 means 'by column', 
# and the third element indicates the function we want to calculate

err = apply(yhat.boost, 2, function(pred) mean((data.test$vote_average - pred)^2))
plot(err, type="l")


# error comparison (train vs test)
plot(boost.movies$train.error, type="l")
lines(err, type="l", col=2)
# minimum error in test set
best=which.min(err)
abline(v=best, lty=2, col=4)

min(err) #minimum error
[1] 0.4598964

Boosting - Deeper trees

boost.movies=gbm(vote_average ~ .-vote_classes, data=data.train, 
                 distribution="gaussian", n.trees=5000, interaction.depth=4)

plot(boost.movies$train.error, type="l")

summary(boost.movies, las=1, cBar=10)  

yhat.boost=predict(boost.movies ,newdata=data.test,n.trees=1:5000)
err = apply(yhat.boost,2,function(pred) mean((data.test$vote_average-pred)^2))
plot(err, type="l")



plot(boost.movies$train.error, type="l")
lines(err, type="l", col=2)
best=which.min(err)
abline(v=best, lty=2, col=4)

min(err)
[1] 0.4327901

Boosting - Smaller learning rate

boost.movies=gbm(vote_average ~ .-vote_classes, data=data.train, 
                 distribution="gaussian", n.trees=5000, interaction.depth=1, shrinkage=0.01)
plot(boost.movies$train.error, type="l")

par(mai=mai.new)


summary(boost.movies, las=1, cBar=10) 
par(mai=mai.old)

yhat.boost=predict(boost.movies ,newdata=data.test,n.trees=1:5000)
err = apply(yhat.boost,2,function(pred) mean((data.test$vote_average-pred)^2))
plot(err, type="l")



plot(boost.movies$train.error, type="l")
lines(err, type="l", col=2)
best=which.min(err)
abline(v=best, lty=2, col=4)

min(err)
[1] 0.4568311

Boosting - combination of previous models

boost.movies=gbm(vote_average ~ .-vote_classes ,data=data.train, 
                 distribution="gaussian",n.trees=5000, interaction.depth=4, shrinkage=0.01)

plot(boost.movies$train.error, type="l")
#

par(mai=mai.new)


summary(boost.movies, las=1, cBar=10) 

par(mai=mai.old)

yhat.boost=predict(boost.movies ,newdata=data.test,n.trees=1:5000)
err = apply(yhat.boost, 2, function(pred) mean((data.test$vote_average-pred)^2))
plot(err, type="l")



plot(boost.movies$train.error, type="l")
lines(err, type="l", col=2)
best=which.min(err)
abline(v=best, lty=2, col=4)

err.boost= min(err)
# Comparison of models in terms of residual deviance
dev.gbm<- (sum((yhat.boost[,best]-data.test$vote_average)^2))
# dev.gbm
# dev.gam
# dev.lm

boost.movies
gbm(formula = vote_average ~ . - vote_classes, distribution = "gaussian", 
    data = data.train, n.trees = 5000, interaction.depth = 4, 
    shrinkage = 0.01)
A gradient boosted model with gaussian loss function.
5000 iterations were performed.
There were 23 predictors of which 22 had non-zero influence.
# partial dependence plots
plot(boost.movies, i.var=1, n.trees = best)

plot(boost.movies, i.var=2, n.trees = best)

plot(boost.movies, i.var=5, n.trees = best)

plot(boost.movies, i.var=c(1,5), n.trees = best) #bivariate (library(viridis) may be necessary)

#
plot(boost.movies, i.var=3, n.trees = best) # categorical

plot(boost.movies, i.var=6, n.trees = best)


plot(boost.movies, i=23, n.trees = best)# categorical

plot(boost.movies, i=17, n.trees = best) #no effect

---
title: "Lab 5: Generalized Additive Models, Prophet, Gradient Boosting"
output: html_notebook
---

```{r}
Sys.setlocale('LC_ALL', 'C')
library(fpp2)
library(gam)
library(prophet)
library(gbm)  # was 'library (gbm)': no space before '(' per the style guide
library(viridis)
# NOTE(review): a hard-coded setwd() makes the notebook machine-specific;
# prefer project-relative paths (e.g. here::here()) — kept so the data files
# below still resolve on the original machine
setwd('/home/hieu/BDMA/TSA')
```

```{r}
# Quarterly international arrivals data shipped with fpp2
?arrivals
autoplot(arrivals)
# plot only the first two series
autoplot(arrivals[,c(1,2)])

# extract one univariate ts per origin country
# (columns appear to be Japan, NZ, UK, US in this order — confirm with ?arrivals)
Japan<- arrivals[,1]
NZ<- arrivals[,2]
UK<- arrivals[,3]
US<- arrivals[,4]
```

# Linear regression

```{r}
# Linear regression on the NZ series: linear trend + quarterly dummies
tt <- seq_along(NZ)  # time index (idiomatic replacement for 1:length(NZ))
# rep_len() recycles 1:4 over the full series length, so the three trailing
# quarters of the incomplete final year are covered automatically —
# equivalent to the original rep(1:4, length(NZ)/4) followed by a manual 1:3
seas <- factor(rep_len(1:4, length(NZ)))
mod2 <- lm(NZ ~ tt + seas)
summary(mod2)
AIC(mod2)
```

Linear model with no seasonality

```{r}
# Benchmark: trend-only linear model (no seasonal dummies), to be compared
# with the trend+season model mod2 fitted above
mod2a <- lm(NZ~ tt)
summary(mod2a)
```

we now try with a GAM

```{r}
# Values for df should be greater than 1, with df=1 implying a linear fit. Default is df=4
# GAM: smooth spline of time plus seasonal dummies
g1 <- gam(NZ ~ s(tt) + seas)
summary(g1)

par(mfrow = c(1, 2))
plot(g1, se = TRUE)  # TRUE instead of T: T/F are ordinary (reassignable) variables
AIC(g1)

# time has a nonlinear effect
g1a <- gam(NZ ~ s(tt))
par(mfrow = c(1, 2))
plot(g1a, se = TRUE)
summary(g1a)

# a linear model may also be fitted with library(gam)
g0 <- gam(NZ ~ tt + seas)
summary(g0)
par(mfrow = c(1, 2))
plot(g0, se = TRUE)
AIC(g0)
```

GAM with splines performs better

```{r}
# try another option with loess (lo) instead of a smoothing spline
g2 <- gam(NZ ~ lo(tt) + seas)
summary(g2)
par(mfrow = c(1, 2))
plot(g2, se = TRUE)  # TRUE instead of the reassignable shortcut T
AIC(g2)

# analysis of the GAM residuals: fit an ARMA model to what the GAM left over
tsdisplay(residuals(g1))
aar1 <- auto.arima(residuals(g1))

# observed series vs GAM fit (green) vs GAM + ARMA-corrected fit (blue)
plot(as.numeric(NZ), type = 'l')
lines(fitted(g1), col = 3)
lines(fitted(aar1) + fitted(g1), col = 4)

# Exercise: try also with the other variables
```

# Prophet

iPhone sales

```{r}
# Load quarterly iPhone sales data (semicolon-separated CSV)
iphone <- read.csv('iphone.csv', sep=';')
str(iphone)

# some preliminary analysis on the data
iphone$quarter <- as.Date(iphone$quarter, format='%d/%m/%Y')
str(iphone)
# prophet expects columns named 'ds' (date) and 'y' (response);
# 'cap' is the carrying capacity used by logistic growth
colnames(iphone) <- c('y','ds','cap')

summary(iphone$y)  ## to explain why cap is 100

# plot: nonlinear trend and seasonality are both visible
plot(iphone$y, type='l', x=iphone$ds, ylab='', xlab='Year')
```

```{r}
# Prophet model
# model with no seasonality (kept for reference):
# m2=prophet(iphone, yearly.seasonality=F, daily.seasonality=F, weekly.seasonality = F, growth='logistic', n.changepoints=15)

# model with automatic selection of seasonality (default is 'auto');
# in this case just yearly seasonality is detected — daily and weekly
# seasonality would have to be requested explicitly if needed.
# changepoints not specified, so they are set automatically
m2 <- prophet(iphone, growth='logistic')

# n.changepoints default number is 25
summary(m2)
```

```{r}
# create a future 'window' for prediction: 20 quarters ahead, including the
# historical dates so in-sample fitted values are returned as well
future2 <- make_future_dataframe(m2, periods = 20, freq='quarter', include_history = TRUE)
tail(future2)
# logistic growth needs the carrying capacity on the future frame too
future2$cap <- 100

forecast2 <- predict(m2, future2)
tail(forecast2[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])

# prediction plot
plot(m2, forecast2)

# Dynamic plot
dyplot.prophet(m2, forecast2)

# plot with change points (threshold=0 shows every candidate change point)
plot(m2, forecast2)+add_changepoints_to_plot(m2, threshold=0)
```

```{r}
# dates corresponding to change points
m2$changepoints

# Prophet with no change points and multiplicative seasonality
m2 <- prophet(iphone, growth='logistic', n.changepoints=0, seasonality.mode='multiplicative')

# summary(m2)
# m2$seasonalities
future2 <- make_future_dataframe(m2, periods = 20, freq='quarter', include_history = TRUE)
tail(future2)
future2$cap <- 100

forecast2 <- predict(m2, future2)
tail(forecast2[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])

# prediction plot
plot(m2, forecast2)
dyplot.prophet(m2, forecast2)
```

# Generalized Additive Models

'Movies' Dataset

```{r}
# Load the movies dataset; keep strings as factors for modelling
data <- read.csv('movies.csv', stringsAsFactors=TRUE)
str(data)
# erase columns of indicator variables
data<-data[,-c(1,2)]

# transform variable release_date to Date format
data$release_date <- as.Date(data$release_date, '%d/%m/%Y')

str(data)

# Response variable: the average rating
summary(data$vote_average)
#boxplot(data$vote_average, col='orange', ylim=c(0,10), main='Movies', ylab='Rating')
hist(data$vote_average, col='orange', main='Movies', xlab='Rating')
# explanatory variables
summary(data)
```

```{r}
# Distributions of the quantitative predictors before the log transform
summary(data[, c(1, 2, 5, 6)])
par(mfrow = c(2, 2))
for (i in c(1, 2, 5, 6)) {
  # paste() with a single argument was a no-op; use the column name directly
  hist(data[, i], col = 'orange', main = colnames(data)[i], xlab = '')
}

# transform quantitative variables to log scale
# NOTE(review): log() yields -Inf for zero budgets/revenues — this assumes
# the file contains only positive values; confirm upstream cleaning
data$budget <- log(data$budget)
data$popularity <- log(data$popularity)
data$revenue <- log(data$revenue)

summary(data[, c(1, 2, 5, 6)])
```

```{r}
# Distributions after the log transform
par(mfrow = c(2, 2))
for (i in c(1, 2, 5, 6)) {
  hist(data[, i], col = 'orange', main = colnames(data)[i], xlab = '')
}
# go back to original panel
par(mfrow = c(1, 1))

# transform release_date into a numeric day count (days since 1970-01-01)
data$release_date <- as.numeric(data$release_date)
summary(data$release_date)

# as.numeric(as.Date('1970-01-01'))

# Set train and test: 70/30 split with a fixed seed for reproducibility
set.seed(1)
train <- sample(seq_len(nrow(data)), 0.7 * nrow(data))
data.train <- data[train, ]
data.test <- data[-train, ]

# make the binary indicators and categorical columns factors
data.train[, c(3, 7, 10:24)] <- lapply(data.train[, c(3, 7, 10:24)], factor)
data.test[, c(3, 7, 10:24)] <- lapply(data.test[, c(3, 7, 10:24)], factor)

str(data.train)
```

# Linear Model

```{r}
# Full linear model for the rating; vote_classes is excluded because it is
# a binned version of the response (its levels are rating intervals)
m1 <- lm(vote_average~.-vote_classes, data=data.train)

summary(m1)

# Stepwise Regression: AIC-based selection, forward and backward moves
m2 <- step(m1, direction='both')
summary(m2)

# Prediction: residual deviance (sum of squared errors) on the test set
p.lm <- predict(m2, newdata=data.test)
dev.lm <- sum((p.lm-data.test$vote_average)^2)
dev.lm

AIC(m2)
```

# GAM

```{r}
# Stepwise GAM
# Start with an all-linear fit (df=1 for every term)
g3 <- gam(vote_average ~ . - vote_classes, data = data.train)

# Show the linear effects
par(mfrow = c(3, 5))
plot(g3, se = TRUE)

# Perform stepwise selection using gam scope
# Values for df should be greater than 1, with df=1 implying a linear fit
# NOTE(review): data.train[,-8] drops vote_average (column 8); in the reduced
# frame, response=8 then points at vote_classes, so both columns end up
# excluded from the scope. That is the desired outcome here, but it is
# fragile if the column order ever changes — confirm.
sc <- gam.scope(data.train[, -8], response = 8, arg = c('df=2', 'df=3', 'df=4'))
g4 <- step.Gam(g3, scope = sc, trace = TRUE)
summary(g4)

AIC(g4)
```

```{r}
# All selected smooth/linear terms of the stepwise GAM
par(mfrow = c(3, 5))
plot(g4, se = TRUE)

# if we want to see better some plot, ask=TRUE gives an interactive menu
par(mfrow = c(1, 1))
plot(g4, se = TRUE, ask = TRUE)

# Prediction on the test set
# (the factor conversion of data.test was already done right after the
# train/test split, so the duplicated lapply(..., factor) call is removed)
p.gam <- predict(g4, newdata = data.test)
dev.gam <- sum((p.gam - data.test$vote_average)^2)
dev.gam
```

# Gradient Boosting

Boosting-

```{r}
# Gradient boosting baseline: stumps (interaction.depth=1), 5000 trees,
# squared-error loss
boost.movies <- gbm(vote_average ~ . - vote_classes, data = data.train,
                    distribution = 'gaussian', n.trees = 5000, interaction.depth = 1)

# for the plot
par(mfrow = c(1, 1))

# plot of training error
plot(boost.movies$train.error, type = 'l', ylab = 'training error')
```

```{r}
# always decreasing with increasing number of trees
# relative influence plot: summary() on a gbm object reports and bar-plots
# the relative influence of each predictor
summary(boost.movies) 
```

```{r}
# let us modify the graphical parameters to obtain a better plot: more space on the left
# default vector of margin sizes in inches (bottom, left, top, right)
mai.old<-par()$mai
mai.old
# new vector
mai.new<-mai.old
# new space on the left (so long variable names fit on the y axis)
mai.new[2] <- 2.5 
mai.new
# modify graphical parameters
par(mai=mai.new)
summary(boost.movies, las=1) 
# las=1 horizontal names on y
summary(boost.movies, las=1, cBar=10) 
# cBar defines how many variables are shown in the bar plot
# back to original window
par(mai=mai.old)
```

```{r}
# test set prediction for every iteration (1:5000)
yhat.boost <- predict(boost.movies, newdata = data.test, n.trees = 1:5000)

# calculate the test error for each iteration:
# apply over columns (2) of the prediction matrix — one column per tree
# count — computing the mean squared error against the observed ratings
err <- apply(yhat.boost, 2, function(pred) mean((data.test$vote_average - pred)^2))
plot(err, type = 'l')

# error comparison (train vs test)
plot(boost.movies$train.error, type = 'l')
lines(err, type = 'l', col = 2)
# iteration with minimum error in the test set
best <- which.min(err)
abline(v = best, lty = 2, col = 4)
min(err)  # minimum error
```

Boosting - Deeper trees

```{r}
# Deeper trees: allow interactions up to order 4
boost.movies <- gbm(vote_average ~ . - vote_classes, data = data.train,
                    distribution = 'gaussian', n.trees = 5000, interaction.depth = 4)

plot(boost.movies$train.error, type = 'l')
summary(boost.movies, las = 1, cBar = 10)
```

```{r}
# Test error per number of trees for the deeper model
yhat.boost <- predict(boost.movies, newdata = data.test, n.trees = 1:5000)
err <- apply(yhat.boost, 2, function(pred) mean((data.test$vote_average - pred)^2))
plot(err, type = 'l')

# train (black) vs test (red) error
plot(boost.movies$train.error, type = 'l')
lines(err, type = 'l', col = 2)
best <- which.min(err)
abline(v = best, lty = 2, col = 4)
min(err)
```

Boosting - Smaller learning rate

```{r}
# Smaller learning rate (shrinkage = 0.01) with stumps
boost.movies <- gbm(vote_average ~ . - vote_classes, data = data.train,
                    distribution = 'gaussian', n.trees = 5000, interaction.depth = 1, shrinkage = 0.01)
plot(boost.movies$train.error, type = 'l')

# widen the left margin for the influence bar plot, then restore it
par(mai = mai.new)
summary(boost.movies, las = 1, cBar = 10)
par(mai = mai.old)
```

```{r}
# Test error per number of trees for the low-shrinkage model
yhat.boost <- predict(boost.movies, newdata = data.test, n.trees = 1:5000)
err <- apply(yhat.boost, 2, function(pred) mean((data.test$vote_average - pred)^2))
plot(err, type = 'l')

# train (black) vs test (red) error
plot(boost.movies$train.error, type = 'l')
lines(err, type = 'l', col = 2)
best <- which.min(err)
abline(v = best, lty = 2, col = 4)
min(err)
```

Boosting - combination of previous models

```{r}
# Combination of the previous settings: deeper trees AND smaller learning rate
boost.movies <- gbm(vote_average ~ . - vote_classes, data = data.train,
                    distribution = 'gaussian', n.trees = 5000, interaction.depth = 4, shrinkage = 0.01)

plot(boost.movies$train.error, type = 'l')

# widen the left margin for the influence bar plot, then restore it
par(mai = mai.new)
summary(boost.movies, las = 1, cBar = 10)
par(mai = mai.old)
```

```{r}
# Test error per iteration for the final model; keep the minimum for the
# model comparison below
yhat.boost <- predict(boost.movies, newdata = data.test, n.trees = 1:5000)
err <- apply(yhat.boost, 2, function(pred) mean((data.test$vote_average - pred)^2))
plot(err, type = 'l')

plot(boost.movies$train.error, type = 'l')
lines(err, type = 'l', col = 2)
best <- which.min(err)
abline(v = best, lty = 2, col = 4)
err.boost <- min(err)
```

```{r}
# Comparison of models in terms of residual deviance on the test set,
# using the best-performing number of trees
dev.gbm <- sum((yhat.boost[, best] - data.test$vote_average)^2)
# dev.gbm
# dev.gam
# dev.lm

boost.movies
# partial dependence plots
plot(boost.movies, i.var = 1, n.trees = best)
plot(boost.movies, i.var = 2, n.trees = best)
plot(boost.movies, i.var = 5, n.trees = best)
plot(boost.movies, i.var = c(1, 5), n.trees = best)  # bivariate (library(viridis) may be necessary)
#
plot(boost.movies, i.var = 3, n.trees = best)  # categorical
plot(boost.movies, i.var = 6, n.trees = best)

# i.var spelled out instead of relying on partial matching of 'i'
plot(boost.movies, i.var = 23, n.trees = best)  # categorical
plot(boost.movies, i.var = 17, n.trees = best)  # no effect
```
